2023 Dev/GlowwormScaleComp.R

GlowwormScaleComp = function(Input, MetaData = Input@meta.data, ClassName, SubclassName = NA, Genes, GlowwormGenesOutput = NA, PercentCutoff = 0.10, Linearity = "Auto", Verbose = T, DataSlot, CellNorm, SecondaryGenes = 100, PopScale =T){
  
  if(any(Genes == names(GlowwormGenesOutput))){GeneList = GlowwormGenesOutput[[Genes]]}else if(is.na(GlowwormGenesOutput)){GeneList = Genes}else{stop("If using GlowwormGenes to determine gene inputs the GlowwormGenesOutput parameter should be set to the output of GlowwormGenes, and the Genes parameter should indicate 'UniqueGenesinWindow' or 'NearestNeighborGene'")
  }
  if(length(unique(GeneList))== 0){stop("No genes in input - check results of GlowwormGenesOutput")}  
  
  MetaData <- dplyr::rename(MetaData, Class = all_of(ClassName))
  if(is.na(SubclassName)){
    MetaDataComb = MetaData %>% dplyr::select(Class)
  }else{
    MetaData <- dplyr::rename(MetaData, Subclass = all_of(SubclassName))
    MetaDataComb = MetaData %>% dplyr::select(Class, Subclass)
  }
  if(class(Input) =="Seurat"){
    PullMeta = Input@meta.data %>% dplyr::select(nFeature_RNA, nCount_RNA)
    MetaDataComb = merge(MetaDataComb, PullMeta, by =0)
    row.names(MetaDataComb) = MetaDataComb$Row.names
    MetaDataComb$Row.names = NULL
  }
  
  OutputList = list()
  OutputList[["MetaData"]] = MetaDataComb
  if(class(Input) =="Seurat"){
    GetGenes = subset(GeneList, GeneList %in% row.names(Input@assays$RNA@counts))
    GenesNotIn = subset(GeneList, ! GeneList %in% row.names(Input@assays$RNA@counts))
    if(class(SecondaryGenes) == "character"){
      SecondaryGenes2 = subset(SecondaryGenes, SecondaryGenes %in% row.names(Input@assays$RNA@counts))  
      SecGenes = subset(row.names(Input@assays$RNA@counts), row.names(Input@assays$RNA@counts) %in% SecondaryGenes2 & ! row.names(Input@assays$RNA@counts) %in% GetGenes)
      BackgroundType = "selected"
    }else if(class(SecondaryGenes) %in% c("numeric", "integer") & SecondaryGenes > 0){
      MinClustSize = as.data.frame(table(MetaData$Subclass))
      MinClustSize = MinClustSize[order(MinClustSize$Freq), ]
      ExpByGene = as.data.frame(rowSums(Input@assays$RNA@counts != 0))
      colnames(ExpByGene) = "Ncells"
      ExpGenes = subset(ExpByGene, ExpByGene$Ncells > MinClustSize[[2]][[1]])
      RestGenes = subset(row.names(ExpGenes), ! row.names(ExpGenes) %in% GeneList)
      NSecondary = SecondaryGenes
      set.seed(123)
      SecGenes = sort(sample(RestGenes, NSecondary, replace=F)) 
      BackgroundType = "randomly generated"
    }else if(length(SecondaryGenes) == 0){stop("SecondaryGenes should contain either the number of randomly selected background genes to use genes, or a string of genes to use as secondary genes. At least an equal number is recommended, but default is set to 100")}
    
    DefaultAssay(Input) = "RNA"
    Subset_Data = FetchData(Input, vars = c(GetGenes, SecGenes), cells = row.names(MetaData), slot = DataSlot)
    Subset_Data2 = merge(Subset_Data, PullMeta, by = 0)
    row.names(Subset_Data2)  = Subset_Data2$Row.names
    Subset_Data2$Row.names = NULL
    #row.names(Subset_Data2) = Subset_Data2$Row.names
    #Subset_Data2$Row.names = NULL
    #SeuFile_Reduced = subset(Input, cells = row.names(MetaDataComb))
    #GetGenes = subset(GeneList, GeneList %in% row.names(SeuFile_Reduced@assays$RNA@counts))
    #Subset_Data = SeuFile_Reduced@assays$RNA@counts[GetGenes,  ]        #Genes as row names                
  }else if("dgCMatrix" == class(Input)){
    Input = as.matrix(Input)
    GetGenes = subset(GeneList, GeneList %in% row.names(Input))
    GetBarcs = subset(MetaData, row.names(MetaDataComb) %in% colnames(Input))
    GetBarcs = row.names(GetBarcs)
    Subset_Data = Input[GetGenes, GetBarcs]      #Genes as row names     
  }else if("data.frame" == class(Input)){
    Input = as.matrix(Input)
    GetGenes = subset(GeneList, GeneList %in% row.names(Input))
    GetBarcs = subset(MetaData, row.names(MetaDataComb) %in% colnames(Input))
    GetBarcs = row.names(GetBarcs)
    Subset_Data = Input[GetGenes, GetBarcs]     #Genes as row names      
  }else if("matrix" == class(Input)[1]){
    GetGenes = subset(GeneList, GeneList %in% row.names(Input))
    GetBarcs = subset(MetaData, row.names(MetaDataComb) %in% colnames(Input))
    GetBarcs = row.names(GetBarcs)
    Subset_Data = Input[GetGenes, GetBarcs]        #Genes as row names   
  }else{stop("Input file needs to be a Seurat File, or a matrix, sparse matrix or data frame with genes as row names and barcodes as column names")}
  
  Settings = list(#"InputFile" = as.character(substitute(Input)),
    "SecondaryGenes" = SecGenes,
    "N_Classes" = length(unique(MetaDataComb$Class)),
    "N_Subclasses" = length(unique(MetaDataComb$Subclass)))
  
  if(CellNorm == "Count"){
    Subset_Data2$nFeature_RNA = NULL
    Subset_Data2 = Subset_Data2/Subset_Data2$nCount_RNA
    Subset_Data2$nCount_RNA = NULL
    # Subset_Data = as.data.frame(t(Subset_Data2))
  }else if(CellNorm == "Feat"){
    Subset_Data2$nCount_RNA = NULL
    Subset_Data2 = Subset_Data2/Subset_Data2$nFeature_RNA 
    Subset_Data2$nFeature_RNA = NULL
    Subset_Data = as.data.frame(t(Subset_Data2))
  }else if(CellNorm == "Ratio"){
    Subset_Data2$Ratio = Subset_Data2$nCount_RNA/Subset_Data2$nFeature_RNA
    Subset_Data2$nCount_RNA = NULL; Subset_Data2$nFeature_RNA = NULL
    Subset_Data2 = Subset_Data2/Subset_Data2$Ratio 
    Subset_Data2$Ratio = NULL
    Subset_Data = as.data.frame(t(Subset_Data2))    
  }else if(CellNorm == "None"){
    Subset_Data2$nCount_RNA = NULL; Subset_Data2$nFeature_RNA = NULL
    Subset_Data = as.data.frame(t(Subset_Data2))    
  }
  
  
  if(Verbose == T){cat("| ##         | Preprocessing complete")}
  #row.names(Subset_Data2) = Subset_Data2$Row.names; Subset_Data2$Row.names = NULL
  DataMatrix_Zscore = as.data.frame(sapply(Subset_Data2, function(Subset_Data2) (Subset_Data2-mean(Subset_Data2))/sd(Subset_Data2)))
  row.names(DataMatrix_Zscore) = row.names(Subset_Data2)
  DataMatrix_Zscore[is.na(DataMatrix_Zscore)] = 0
  DataMatrix_Zscore_t = as.data.frame(t(DataMatrix_Zscore))
  if(Verbose == T){cat("\n| ####       | Scaling complete")}
  
  if(is.na(SubclassName) == F){
    MetaDataComb$Combined = paste(MetaDataComb[["Subclass"]], MetaDataComb[["Class"]], sep="^")
    MetaDataComb[["Subclass"]] = NULL
    MetaDataComb[["Class"]] = NULL
  }else{
    MetaDataComb$Combined = paste(MetaDataComb[["Class"]], MetaDataComb[["Class"]], sep="^")
    MetaDataComb[["Class"]] = NULL
  }
  
  Settings[["N_TotalClusters"]] = length(unique(MetaDataComb$Combined))
  Settings[["PercentCutoff"]] = PercentCutoff
  
  DataMatrix_Zscore_wAssigns = merge(DataMatrix_Zscore, MetaDataComb, by = 0)
  DataMatrix_Zscore_wAssigns$nFeature_RNA = NULL
  DataMatrix_Zscore_wAssigns$nCount_RNA = NULL
  row.names(DataMatrix_Zscore_wAssigns) = DataMatrix_Zscore_wAssigns$Row.names
  DataMatrix_Zscore_wAssigns$Row.names = NULL
  DataMatrix_Zscore_wAssigns[DataMatrix_Zscore_wAssigns <= 0] <- 0
  if(Verbose == T){cat("\n| ######     | Metadata filtering complete")}
  
  ScreenedData = as.data.frame(matrix(ncol = length(GetGenes)+1, nrow = 0))
  colnames(ScreenedData) = c(GetGenes, "Combined")
  
  ForLong  =  DataMatrix_Zscore_wAssigns
  ForLong$Combined = NULL
  
  DataLong = melt(ForLong)
  DataLong$Combined = DataMatrix_Zscore_wAssigns$Combined
  DataLong$Barcs = row.names(ForLong)
  DataLong$Count = 1
  DataLong$Binary = ifelse(DataLong$value > 0, 1, 0)
  DataLong$GenePop = paste(DataLong$variable, DataLong$Combined)
  
  GetCutoffs = DataLong %>% group_by(variable, Combined) %>% dplyr::summarise("PC_exp" = sum(Binary)/sum(Count))
  PassedCutoff = subset(GetCutoffs, GetCutoffs$PC_exp >= PercentCutoff)
  PassedCutoff$GenePop = paste(PassedCutoff$variable, PassedCutoff$Combined)
  
  DataLong$Data = ifelse(! DataLong$GenePop %in% PassedCutoff$GenePop, 0, DataLong$value) ##Changed JUN14
  DataLong$GenePop = NULL
  DataLong$Binary = NULL
  DataLong$Count = NULL
  
  if(PopScale == T){
    OutputList[["GlowwormScaleAllCells_Target"]] = subset(DataLong, DataLong$variable %in% GetGenes)
    OutputList[["GlowwormScaleAllCells_Secondary"]] = subset(DataLong, DataLong$variable %in% SecGenes)    
  }else if(PopScale == F){
    ZScoreLong = gather(DataMatrix_Zscore, "variable", "Data")
    ZScoreLong$Barcs = row.names(DataMatrix_Zscore)
    ZScoreLong = merge(ZScoreLong, MetaDataComb, by.x = "Barcs", by.y = 0)   
    ZScoreLong$nFeature_RNA = NULL;  ZScoreLong$nCount_RNA = NULL
    OutputList[["GlowwormScaleAllCells_Target"]] = subset(ZScoreLong, ZScoreLong$variable %in% GetGenes)
    OutputList[["GlowwormScaleAllCells_Secondary"]] = subset(ZScoreLong, ZScoreLong$variable %in% SecGenes)
  }
  
  #  aqm <- melt(DataLong, id=c("Combined", "variable" , "Data"), na.rm=TRUE)
  #  suppressMessages({
  #  ScreenedData <- dcast(aqm, Combined~variable,mean,id.vars = NULL) ##Error
  # })
  ScreenedData <- dcast(DataLong, Barcs~variable, value.var = "Data")
  ScreenedData = merge(ScreenedData, MetaDataComb, by.x = "Barcs", by.y = 0)
  row.names(ScreenedData) = ScreenedData$Barcs
  ScreenedData$Barcs = NULL
  ScreenedData$nFeature_RNA = NULL
  ScreenedData$nCount_RNA = NULL
  #OutputList[["GlowwormScaleAllCells_Target"]] = ScreenedData %>% dplyr::select(GetGenes, Combined)
  #OutputList[["GlowwormScaleAllCells_Secondary"]] = ScreenedData %>% dplyr::select(SecondaryGenes, Combined)  
  
  
  if(Verbose == T){cat("\n| ########   | Background correction complete")}
  DataMatrix_Zscore_Reduced = ScreenedData %>% group_by(Combined) %>% summarise_all(mean, na.rm = TRUE)
  DataMatrix_Zscore_Reduced = as.data.frame(DataMatrix_Zscore_Reduced)
  row.names(DataMatrix_Zscore_Reduced) = DataMatrix_Zscore_Reduced$Combined
  DataMatrix_Zscore_Reduced$Combined = NULL
  DataMatrix_Zscore_Reduced[DataMatrix_Zscore_Reduced < 0] <- 0
  DataMatrix_Zscore_Reduced$Subclass = gsub("\\^.*", "", row.names(DataMatrix_Zscore_Reduced))
  DataMatrix_Zscore_Reduced$Class = gsub(".*\\^", "", row.names(DataMatrix_Zscore_Reduced))
  OutputList[["GlowwormScaleOutput"]] = DataMatrix_Zscore_Reduced
  OutputList[["GlowwormScaleOutput_Target"]] = DataMatrix_Zscore_Reduced %>% dplyr::select(GetGenes)
  OutputList[["GlowwormScaleOutput_Secondary"]] = DataMatrix_Zscore_Reduced %>% dplyr::select(SecGenes)
  DataMatrix_Zscore_Reduced$Subclass = NULL
  DataMatrix_Zscore_Reduced$Class = NULL
  GlowwormScaleOutput_t = as.data.frame(t(DataMatrix_Zscore_Reduced))
  SumStats = as.data.frame(rowSums(GlowwormScaleOutput_t))
  colnames(SumStats) = "Total"
  SumStats$StDev = rowSds(as.matrix(GlowwormScaleOutput_t))
  
  
  ##
  Binary = DataMatrix_Zscore_Reduced
  Binary = as.data.frame(ifelse(Binary > 0, 1, 0))
  Binary$Class = gsub(".*\\^", "", row.names(Binary))
  
  Pull = Binary %>% group_by(Class) %>% summarise_all(sum)
  if(dim(Pull)[1] > 1){
    Pull = as.data.frame(Pull)
    row.names(Pull) = Pull$Class
    Pull$Class = NULL
    Pull2 = apply(Pull, 2, function(x){x/sum(x)})
    Pull3 = as.data.frame(colMaxs(Pull2))
    row.names(Pull3) = colnames(Pull)
    colnames(Pull3) = "FreqPerc"
    Pull3[is.na(Pull3)] = 0
    SumStats = merge(SumStats, Pull3, by=0)
    row.names(SumStats) = SumStats$Row.names
    SumStats$Row.names = NULL
    Pull2 = as.data.frame(Pull2)
    Pull4 = as.data.frame(ifelse(Pull2 > 0, 1, 0))
    Pull5 = as.data.frame(colSums(Pull4))
    
    SumStats$SD = SumStats$FreqPerc + SumStats$StDev
    SumStats2 = subset(SumStats, SumStats$Total > 0)
  }else{
    SumStats$SD = SumStats$StDev
    SumStats2 = subset(SumStats, SumStats$Total > 0)
    Pull4 = as.data.frame(Pull)
    row.names(Pull4) = Pull4$Class
    Pull4$Class = NULL
    Pull5 = as.data.frame(t(ifelse(Pull4 > 0, 1, 0)))
    
  }
  
  
  #if(length(Linearity) == 2){
  #    Expression = Linearity[[1]]
  #    Specificity = Linearity[[2]]
  #  }else if(length(Linearity) == 1){
  #    Expression = Linearity
  #    Specificity = Linearity
  #  }else if(length(Linearity) > 2){stop("Linearity needs to be either 'Auto', or a single transfomation (to be applied to both Expression and Specificitydata) or a vector of two values, with the first applied to Expression data and the second to Specificity data")}
  
  #for(ES in c("Expression", "Specificity")){
  #if(ES == "Expression"){
  #  ESCol = "Total"
  #  Transformation = Expression
  #  bc = boxcox(Total~1, data= SumStats2, plotit = F)}
  #  else if(ES == "Specificity"){
  #    ESCol = "SD"
  #      Transformation = Specificity
  #    bc = boxcox(SD~1, data= SumStats2, plotit = F)}
  #lambda <- round_any(bc$x[which.max(bc$y)], 0.5)
  
  #if(Transformation == "Auto"){
  
  # if(lambda == 0){
  #  SumStats[[ESCol]] = log(SumStats[[ESCol]]+1)  
  #  Settings[[paste(ES, "Transformation", sep="_")]] = "Auto: Log"
  #  if(Verbose == T){cat(paste("\n             |", ES, "autocorrected by log transformation"))}
  #  }else if(lambda == 0.5 | lambda == -0.5){
  #  SumStats[[ESCol]] = sqrt(SumStats[[ESCol]])  
  #  Settings[[paste(ES, "Transformation", sep="_")]] = "Auto: Square root"
  #  if(Verbose == T){cat(paste("\n             |", ES, "autocorrected by square root transformation"))}
  #  #}else if(lambda == -0.5){
  #  #SumStats[[ESCol]] = 1/sqrt(SumStats[[ESCol]])  
  #  #Settings[[paste(ES, "Transformation", sep="_")]] = "Auto: Inverse square root"
  #  # if(Verbose == T){cat(paste("\n             |", ES, "autocorrected by inverse square root transformation"))}
  #  #}else if(lambda == -1){
  #  #SumStats[[ESCol]] = 1/SumStats[[ESCol]]
  #  #Settings[[paste(ES, "Transformation", sep="_")]] = "Auto: Inversion"
  #  #if(Verbose == T){cat(paste("\n             |", ES, "autocorrected by inversion"))}
  #  }else if(lambda == 1 | lambda == -1){
  #  Settings[[paste(ES, "Transformation", sep="_")]] = "Auto: No autocorrection"
  #  if(Verbose == T){cat(paste("\n             | No autocorrection found for", ES, "parameter"))}
  #  }else if(lambda > 1 | lambda < 1){
  #  SumStats[[ESCol]] = SumStats[[ESCol]]^lambda
  #  Settings[[paste(ES, "Transformation", sep="_")]] = paste("Auto: x^", lambda)
  #  if(Verbose == T){cat(paste("\n             |", ES, "autocorrected by raising to the power of", lambda))}
  #  #}else if(lambda < 1){
  #  #SumStats[[ESCol]] = 1/SumStats[[ESCol]]^lambda
  #  #Settings[[paste(ES, "Transformation", sep="_")]] = paste("Auto: 1/x^", lambda)
  #  #if(Verbose == T){cat(paste("\n             |", ES, "autocorrected by raising to the power of", lambda, "then inversion"))}
  #  }
  #  }else if(length(grep("\\^", Transformation, value = T)) > 0){ #For Exponent
  #  Transformation = as.numeric(gsub("\\^", "", Transformation))  
  #  SumStats[[ESCol]] = SumStats[[ESCol]] ^Transformation
  #  Settings[[paste(ES, "Transformation", sep="_")]] = paste("x^", Transformation)
  #  }else if(length(grep("og", Transformation, value = T)) > 0){ #For Log
  #  Transformation = as.numeric(gsub("Log", "",gsub("log", "", Transformation)))
  #  if(Transformation == ""){
  #  SumStats[[ESCol]] = log(SumStats[[ESCol]]+1)
  #  Settings[[paste(ES, "Transformation", sep="_")]] = "Log"
  #  }else{
  #  SumStats[[ESCol]] = log(SumStats[[ESCol]]+1, Transformation)
  #  Settings[[paste(ES, "Transformation", sep="_")]] = paste("Log base", Transformation)
  #  }}else if(is.numeric(Transformation) == T & Transformation > 0 & ! Transformation %in% c(0,1) | is.numeric(Transformation) == T & Transformation < 0){ 
  #  Expression = as.numeric(Transformation)  
  #  SumStats[[ESCol]] = SumStats[[ESCol]] * Transformation
  #  Settings[[paste(ES, "Transformation", sep="_")]] = paste("Multiplied by", Transformation)
  #  }else if(is.numeric(Transformation) == T & Transformation %in% c(0,1) |  is.na(Transformation) == T){
  #  Settings[[paste(ES, "Transformation", sep="_")]] = paste("No correction for", ES)    
  #  }
  #}
  
  SumStats[SumStats==Inf] <- 0
  
  SumStats_KeyGenes = subset(SumStats, row.names(SumStats) %in% GetGenes)
  SumStats_SecondaryGenes = subset(SumStats, row.names(SumStats) %in% SecGenes)
  
  SumStats_KeyGenes$Total = (SumStats_KeyGenes$Total- min(SumStats_KeyGenes$Total))/(max(SumStats_KeyGenes$Total) - min(SumStats_KeyGenes$Total))
  SumStats_KeyGenes$SD = (SumStats_KeyGenes$SD- min(SumStats_KeyGenes$SD))/(max(SumStats_KeyGenes$SD) - min(SumStats_KeyGenes$SD))
  SumStats_KeyGenes$RankScore = SumStats_KeyGenes$Total + SumStats_KeyGenes$SD
  SumStats_KeyGenes = SumStats_KeyGenes[order(-SumStats_KeyGenes$RankScore),]
  SpecCut = subset(Pull5, Pull5[[1]] >= 1)
  SumStats3 = subset(SumStats_KeyGenes, row.names(SumStats_KeyGenes) %in% row.names(SpecCut))
  OutputList[["SumStats"]] = SumStats3
  
  Settings[["InputGenes"]] = GeneList
  Settings[["MappedGenes"]] = row.names(SumStats_KeyGenes)
  Settings[["GenesNotFound"]] = subset(GeneList, ! GeneList %in% row.names(SumStats_KeyGenes))
  OutputList[["SumStats"]] = SumStats_KeyGenes
  colnames(Pull5) = "Specificity"
  Pull5[is.na(Pull5)] = 0
  OutputList[["SpecificityScore"]] = Pull5
  OutputList[["Settings"]] = Settings
  
  setClass("GlowwormObj", representation(MetaData = "data.frame", GlowwormScaleOutput = "data.frame", GlowwormScaleOutput_Secondary = "data.frame",GlowwormScaleAllCells = "data.frame", GlowwormScaleAllCells_Secondary = "data.frame",  SumStats = "data.frame", SpecificityScore = "data.frame", Settings = "list"))
  
  
  Output = new("GlowwormObj", 
               MetaData = OutputList[["MetaData"]], 
               GlowwormScaleOutput = OutputList[["GlowwormScaleOutput_Target"]], 
               GlowwormScaleOutput_Secondary = OutputList[["GlowwormScaleOutput_Secondary"]], 
               GlowwormScaleAllCells = OutputList[["GlowwormScaleAllCells_Target"]], 
               GlowwormScaleAllCells_Secondary = OutputList[["GlowwormScaleAllCells_Secondary"]], 
               SumStats = OutputList[["SumStats"]], 
               SpecificityScore = OutputList[["SpecificityScore"]], 
               Settings = OutputList[["Settings"]])
  
  
  # OutputList[["GlowwormScaleAllCells_Secondary"]]
  
  if(Verbose == T){cat(paste("\n| ########## | GlowwormScale complete\nFrom", length(GeneList), " input genes, ", length(GetGenes), " genes were processed.", length(GenesNotIn), " genes were not in input data.\n", length(SecondaryGenes), " ", BackgroundType, "background genes were used"), sep="")}
  return(Output)
}
Hannahglover/Glowworm documentation built on Jan. 16, 2024, 11:47 p.m.